# You need to run this block of code to import the libraries that are needed for the data manipulations, visualizations, etc.
from pydoc import help # can type in the python console `help(name of function)` to get the documentation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from scipy import stats
from IPython.display import display, HTML
# Read csv file into pandas dataframe
forest = pd.read_csv("ForestData.csv")
# Examine first few cases
forest.head()
# List the columns
list(forest.columns)
forest.dtypes
df = forest[forest['Ecosystem.element.assessed..soil.plant.microbe.fungi.fauna.'].isin(['Plant']) ]
df
a=['non-native' , 'alien species']
df = df[~df['Endpoint.Modifier'].isin(a)]
df
df = df[df['Endpoint'].isin(['shannon diversity', 'simpson diversity', 'species richness']) ]
df
# Getting rid of rows with missing data on the response variable
df = df[df['Response..Measurement'].notna()]
# Getting rid of rows with missing data on the response variable standard deviation
df = df[df['Response.Standard.Deviation'].notna()]
# Getting rid of rows with missing data on the response variable sample size
df = df[df['Response.Number.of.Observations'].notna()]
# Getting rid of rows with missing data on the control variable standard deviation
df = df[df['Control.Reference.Measurement'].notna()]
# Getting rid of rows with missing data on the response variable standard deviation
df = df[df['Control.Standard.Deviation'].notna()]
# Getting rid of rows with missing data on the response variable standard deviation
df = df[df['Control.Number.of.Observations'].notna()]
df
df['Year'] = df['Reference'].astype('str').str.extractall('(\d+)').unstack().fillna('').sum(axis=1).astype(int)
df['Year'] = df['Year'].astype(float).apply(lambda x:x if (x > 0) else 9999)
df[df['Year'] > 1995]
# Create a function that we can re-use
# This block of code will create a histogram. You don't have to use all of this code next time you want to create one
def show_distribution(var_data):
from matplotlib import pyplot as plt
# Get statistics
min_val = var_data.min()
max_val = var_data.max()
mean_val = var_data.mean()
med_val = var_data.median()
mod_val = var_data.mode()[0]
print('Minimum:{:.2f}\nMean:{:.2f}\nMedian:{:.2f}\nMode:{:.2f}\nMaximum:{:.2f}\n'.format(min_val,
mean_val,
med_val,
mod_val,
max_val))
# Create a figure for 2 subplots (2 rows, 1 column)
fig, ax = plt.subplots(2, 1, figsize = (10,4))
# Plot the histogram
ax[0].hist(var_data)
ax[0].set_ylabel('Frequency')
# Add lines for the mean, median, and mode
ax[0].axvline(x=min_val, color = 'gray', linestyle='dashed', linewidth = 2)
ax[0].axvline(x=mean_val, color = 'cyan', linestyle='dashed', linewidth = 2)
ax[0].axvline(x=med_val, color = 'red', linestyle='dashed', linewidth = 2)
ax[0].axvline(x=mod_val, color = 'yellow', linestyle='dashed', linewidth = 2)
ax[0].axvline(x=max_val, color = 'gray', linestyle='dashed', linewidth = 2)
# Plot the boxplot
ax[1].boxplot(var_data, vert=False)
ax[1].set_xlabel('Value')
# Add a title to the Figure
fig.suptitle('Data Distribution')
# Show the figure
fig.show()
col = df['Response..Measurement']
# Call the function
show_distribution(col)
# Get the variable to examine
col = df['Control.Reference.Measurement']
# Call the function
show_distribution(col)
# Exploring relationships - this code will create scatter plots
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
# selecting numerical features
features = ['Response..Measurement', 'Control.Reference.Measurement', 'Response.Standard.Deviation', 'Control.Standard.Deviation']
# plotting the scatter matrix
scatter_matrix(df[features], figsize=(12,12))
plt.xticks(rotation=90)
plt.show()
# If you want to compare groups based on one or more quantitative variables, use this code
col_list = ['Management.Method', 'Response..Measurement', 'Control.Reference.Measurement']
short_df = df[col_list]
import seaborn as sns
rs=1999
df_long = pd.melt(short_df.sample(150,random_state=rs), "Management.Method", var_name="Columns", value_name="Values")
f,ax = plt.subplots(figsize=(16,8))
#plt.xticks(rotation=90)
plt.ylim(0, 125)
#plt.xlim(0, None)
sns.boxplot(x="Columns", y="Values", hue="Management.Method", data=df_long)
df['ts2'] = df['Response.Standard.Deviation']**2
df['cs2'] = df['Control.Standard.Deviation']**2
df['term1'] = ((df['Response.Number.of.Observations'] - 1) * (df['ts2']))
df['term2'] = ((df['Control.Number.of.Observations'] - 1) * (df['cs2']))
df['numerator'] = df['term1'] + df['term2']
df['denominator'] = df['Response.Number.of.Observations'] + df['Control.Number.of.Observations'] - 2
df['divided'] = df['numerator'] / df['denominator']
df['pooledVar']=df['divided']**(1/2)
df['target'] = ((df['Response..Measurement'] - df['Control.Reference.Measurement']) / (df['pooledVar']))
# Getting rid of rows with missing data on the response variable
df = df[df['target'].notna()]
df
# If you want to compare groups based on one or more quantitative variables, use this code
col_list = ['Management.Method', 'target']
short_df = df[col_list]
import seaborn as sns
rs=1999
df_long = pd.melt(short_df.sample(150,random_state=rs), "Management.Method", var_name="Management Method", value_name="Effect Size")
f,ax = plt.subplots(figsize=(16,8))
#plt.xticks(rotation=90)
plt.ylim(-10, 10)
#plt.xlim(0, None)
sns.boxplot(x="Management Method", y="Effect Size", hue="Management.Method", data=df_long)
# If you want to compare groups based on one or more quantitative variables, use this code
col_list = ['Ecosystem.element.assessed..soil.plant.microbe.fungi.fauna.', 'target']
short_df = df[col_list]
import seaborn as sns
rs=1999
df_long = pd.melt(short_df.sample(150,random_state=rs), "Ecosystem.element.assessed..soil.plant.microbe.fungi.fauna.", var_name="Columns", value_name="Values")
f,ax = plt.subplots(figsize=(16,8))
#plt.xticks(rotation=90)
plt.ylim(-10, 10)
#plt.xlim(0, None)
sns.boxplot(x="Columns", y="Values", hue="Ecosystem.element.assessed..soil.plant.microbe.fungi.fauna.", data=df_long)
#Keeping only the columns we want
cols_to_keep = ['Reference',
'Habitat.Type',
'Management.Method',
'Management.Effect.Type',
'Ecosystem.element.assessed..soil.plant.microbe.fungi.fauna.',
'Endpoint',
'Measurement.Unit',
'Endpoint.Modifier',
'Magnitude.or.degree.of.change',
'Magnitude.or.degree.of.change..text.',
'Response..Measurement',
'Response.Number.of.Observations',
'Response.Standard.Deviation',
'Control.Reference.Measurement',
'Control.Number.of.Observations',
'Control.Standard.Deviation',
'general.area',
'Location',
'Year',
'target']
df2=df[cols_to_keep]
#Create dummy variables
combine = pd.concat([df2,pd.get_dummies(df['Management.Method'], prefix='ID')],axis=1)
combine.columns
#Create dummy variables
combine2 = pd.concat([combine,pd.get_dummies(combine['Habitat.Type'], prefix='ID')],axis=1)
combine2.columns
combine3 = pd.concat([combine2,pd.get_dummies(combine2['Management.Effect.Type'], prefix='ID')],axis=1)
combine3.columns
combine4 = pd.concat([combine3,pd.get_dummies(combine3['Location'], prefix='ID')],axis=1)
combine4.columns
combine5 = pd.concat([combine4,pd.get_dummies(combine4['Endpoint'], prefix='ID')],axis=1)
combine5.columns
combine5.head(10)
#conda install graphviz
#conda update -n base -c defaults conda
# Creating a list of features for the decision tree
# Include in this list, all the predictor variables that you want to use in your decision tree
features=['ID_Carbon Addition',
'ID_Prescribed Burn',
'ID_Thinning',
'ID_Coniferous Forest',
'ID_Deciduous Forest',
'ID_Forest',
'ID_community composition',
'ID_shannon diversity',
'ID_simpson diversity',
'ID_species richness',
]
target=['target']
# This block of code runs the decision tree analysis
# Replace TargetVar with the name of the target or dependent variable of interest
# max_depth and max_leaf_nodes are hyper parameters that are set by the researcher, they limit the size of the tree
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn import datasets, tree
y = combine5["target"]
X = combine5[features]#.astype(float)
dt = DecisionTreeRegressor(min_samples_split=10, min_samples_leaf=10, random_state=99, max_depth=5, max_leaf_nodes=7)
dt.fit(X, y)
text_representation = tree.export_text(dt)
print(text_representation)
with open("decistion_tree.log", "w") as fout:
fout.write(text_representation)
fig = plt.figure(figsize=(25,20))
_ = tree.plot_tree(dt,
feature_names=features,
class_names=target,
filled=True)
# If you want to compare groups based on one or more quantitative variables, use this code
col_list = ['Habitat.Type', 'target']
short_df = df[col_list]
import seaborn as sns
rs=1999
df_long = pd.melt(short_df.sample(150,random_state=rs), "Habitat.Type", var_name="Habitat Type", value_name="Effect Size")
f,ax = plt.subplots(figsize=(16,8))
#plt.xticks(rotation=90)
plt.ylim(-10, 10)
#plt.xlim(0, None)
sns.boxplot(x="Habitat Type", y="Effect Size", hue="Habitat.Type", data=df_long)
#conda install -c plotly plotly=5.10.0
# selecting rows based on condition
richness = df[df['Endpoint'] == 'species richness']
richness.rename(columns = {'target': 'Effect Size', 'Management.Method':'Management Method', 'Habitat.Type':'Habitat Type'}, inplace = True)
richness
import plotly.express as px
px.box(richness, x="Effect Size", y="Management Method", orientation="h", color="Habitat Type", notched=True,
category_orders={"Management Method": ["Carbon Addition", "Prescribed Burn", "Thinning"]})
# selecting rows based on condition
shannon = df[df['Endpoint'] == 'shannon diversity']
shannon.rename(columns = {'target': 'Effect Size', 'Management.Method':'Management Method', 'Habitat.Type':'Habitat Type'}, inplace = True)
richness
px.box(shannon, x="Effect Size", y="Management Method", orientation="h", color="Habitat Type", notched=True,
category_orders={"Management Method": ["Carbon Addition", "Prescribed Burn", "Thinning"]})
# selecting rows based on condition
simpson = df[df['Endpoint'] == 'simpson diversity']
simpson.rename(columns = {'target': 'Effect Size', 'Management.Method':'Management Method', 'Habitat.Type':'Habitat Type'}, inplace = True)
richness
px.box(simpson, x="Effect Size", y="Management Method", orientation="h", color="Habitat Type", notched=True,
category_orders={"Management Method": ["Carbon Addition", "Prescribed Burn", "Thinning"]})